sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Los_Angeles
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods
## [7] base
##
## other attached packages:
## [1] knitr_1.45
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.33 R6_2.5.1 fastmap_1.1.1
## [4] xfun_0.41 cachem_1.0.8 htmltools_0.5.7
## [7] rmarkdown_2.25 lifecycle_1.0.4 cli_3.6.2
## [10] sass_0.4.8 jquerylib_0.1.4 compiler_4.3.2
## [13] rstudioapi_0.15.0 tools_4.3.2 evaluate_0.23
## [16] bslib_0.6.1 yaml_2.3.8 formatR_1.14
## [19] rlang_1.1.2 jsonlite_1.8.8
getwd()
## [1] "/Users/heerpatel/Desktop/MGSC-310"
library("tidyverse")
library("readr")
library("dplyr")
library("ggplot2")
library("ggthemes")
library("ggrepel")
library("forcats")
library("formatR")
library("rsample")
library("purrr")
library("plotROC")
library("glmnet")
library("glmnetUtils")
library("yardstick")
library("corrplot")
library("tidymodels")
library(sjPlot)
Adding the dataset
# Load the Spotify/YouTube track table. The path is relative, so it is
# resolved against the current working directory.
songs <- read.csv("datasets/Spotify_Youtube.csv")
# List the column names that came in with the file.
names(songs)
## [1] "X" "Artist" "Url_spotify"
## [4] "Track" "Album" "Album_type"
## [7] "Uri" "Danceability" "Energy"
## [10] "Key" "Loudness" "Speechiness"
## [13] "Acousticness" "Instrumentalness" "Liveness"
## [16] "Valence" "Tempo" "Duration_ms"
## [19] "Url_youtube" "Title" "Channel"
## [22] "Views" "Likes" "Comments"
## [25] "Description" "Licensed" "official_video"
## [28] "Stream"
# Show the storage class of every column (one entry per column).
print(sapply(songs, class))
## X Artist Url_spotify Track
## "integer" "character" "character" "character"
## Album Album_type Uri Danceability
## "character" "character" "character" "numeric"
## Energy Key Loudness Speechiness
## "numeric" "numeric" "numeric" "numeric"
## Acousticness Instrumentalness Liveness Valence
## "numeric" "numeric" "numeric" "numeric"
## Tempo Duration_ms Url_youtube Title
## "numeric" "numeric" "character" "character"
## Channel Views Likes Comments
## "character" "numeric" "numeric" "numeric"
## Description Licensed official_video Stream
## "character" "character" "character" "numeric"
Removing missing values
# Drop every row that contains at least one NA.
songs <- na.omit(songs)
# Count the NAs left in each column; printing only the positive counts yields
# an empty vector when the data are complete.
missing_values <- summarise(songs, across(everything(), ~ sum(is.na(.x))))
print(missing_values[missing_values > 0])
## integer(0)
Removing outliers
#' Flag values that lie inside the IQR fences
#'
#' Computes the first and third quartiles of `variable` and marks each element
#' that falls within `[Q1 - k * IQR, Q3 + k * IQR]`.
#'
#' @param variable A numeric vector.
#' @param k Fence multiplier applied to the interquartile range. Defaults to
#'   1.5, the value that was previously hard-coded.
#' @return A logical vector the same length as `variable`: `TRUE` for values
#'   inside the fences, `FALSE` for values outside them and for `NA`s.
remove_outliers <- function(variable, k = 1.5) {
  # na.rm = TRUE: quantile() stops with an error on NA input otherwise.
  quartiles <- quantile(variable, probs = c(0.25, 0.75), na.rm = TRUE,
                        names = FALSE)
  iqr <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - k * iqr
  upper_bound <- quartiles[2] + k * iqr
  inside <- variable >= lower_bound & variable <= upper_bound
  # An NA in a logical row index would produce all-NA rows; treat it as
  # "not kept" instead.
  inside[is.na(inside)] <- FALSE
  inside
}
# Filter the numeric columns one at a time. Each pass recomputes the fences on
# the rows that survived the previous passes, so the column order matters.
# NOTE(review): the later steps visible in this file build songs_clean from
# `songs`, not from this filtered copy - confirm whether that is intended.
numeric_vars <- vapply(songs, is.numeric, logical(1))
your_data_frame_no_outliers <- songs
for (col_name in names(songs)[numeric_vars]) {
  keep_rows <- remove_outliers(your_data_frame_no_outliers[[col_name]])
  your_data_frame_no_outliers <- your_data_frame_no_outliers[keep_rows, ]
}
Cleaning the dataset (variables)
# Build the modelling table: drop the URL/URI, Description and X columns,
# convert the categorical columns to factors, and derive the extra columns.
songs_clean <- songs %>%
  select(-c(Url_spotify, Uri, Url_youtube, Description, X)) %>%
  mutate(
    Album_type = as.factor(Album_type),
    Licensed = as.factor(Licensed),
    official_video = as.factor(official_video),
    # 1 when the track has more than 100 million streams, otherwise 0
    popular = as.factor(if_else(Stream > 1e+08, 1, 0)),
    Channel = as.factor(Channel),
    # keep the 10 most frequent channels and lump the rest into "Other"
    channel_factor = fct_lump_n(Channel, n = 10),
    Artist = as.factor(Artist),
    log_likes = log(Likes),
    log_dance = log(Danceability)
  )
levels(songs_clean$channel_factor)
## [1] "Atlantic Records" "DisneyMusicVEVO" "RHINO"
## [4] "SMTOWN" "Sony Music India" "SonyMusicIndiaVEVO"
## [7] "SonyMusicSouthVEVO" "T-Series" "YRF"
## [10] "Zee Music Company" "Other"
# Round the listed feature columns to three decimals, then inspect the table.
rounded_cols <- c(
  "Speechiness", "Instrumentalness", "Acousticness", "Liveness",
  "Danceability", "Energy", "Valence"
)
songs_clean[rounded_cols] <- lapply(songs_clean[rounded_cols], round, digits = 3)
glimpse(songs_clean)
## Rows: 19,549
## Columns: 27
## $ Artist <fct> "Gorillaz", "Gorillaz", "Gorillaz", "Gorill…
## $ Track <chr> "Feel Good Inc.", "Rhinestone Eyes", "New G…
## $ Album <chr> "Demon Days", "Plastic Beach", "New Gold (f…
## $ Album_type <fct> album, album, single, album, album, album, …
## $ Danceability <dbl> 0.818, 0.676, 0.695, 0.689, 0.663, 0.760, 0…
## $ Energy <dbl> 0.705, 0.703, 0.923, 0.739, 0.694, 0.891, 0…
## $ Key <dbl> 6, 8, 1, 2, 10, 11, 4, 11, 2, 10, 9, 4, 9, …
## $ Loudness <dbl> -6.679, -5.815, -3.930, -5.810, -8.627, -5.…
## $ Speechiness <dbl> 0.177, 0.030, 0.052, 0.026, 0.171, 0.037, 0…
## $ Acousticness <dbl> 0.008, 0.087, 0.043, 0.000, 0.025, 0.023, 0…
## $ Instrumentalness <dbl> 0.002, 0.001, 0.047, 0.509, 0.000, 0.087, 0…
## $ Liveness <dbl> 0.613, 0.046, 0.116, 0.064, 0.070, 0.298, 0…
## $ Valence <dbl> 0.772, 0.852, 0.551, 0.578, 0.525, 0.966, 0…
## $ Tempo <dbl> 138.559, 92.761, 108.014, 120.423, 167.953,…
## $ Duration_ms <dbl> 222640, 200173, 215150, 233867, 340920, 245…
## $ Title <chr> "Gorillaz - Feel Good Inc. (Official Video)…
## $ Channel <fct> "Gorillaz", "Gorillaz", "Gorillaz", "Gorill…
## $ Views <dbl> 693555221, 72011645, 8435055, 211754952, 61…
## $ Likes <dbl> 6220896, 1079128, 282142, 1788577, 6197318,…
## $ Comments <dbl> 169907, 31003, 7399, 55229, 155930, 72008, …
## $ Licensed <fct> True, True, True, True, True, True, False, …
## $ official_video <fct> True, True, True, True, True, True, True, F…
## $ Stream <dbl> 1040234854, 310083733, 63063467, 434663559,…
## $ popular <fct> 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1…
## $ channel_factor <fct> Other, Other, Other, Other, Other, Other, O…
## $ log_likes <dbl> 15.643425, 13.891664, 12.550166, 14.396931,…
## $ log_dance <dbl> -0.2008929, -0.3915622, -0.3638434, -0.3725…
Removing further outliers (IQR rule on Likes) and rows with non-finite log values
# Fences for Likes: quartiles minus/plus 1.5 times the interquartile range.
likes_quartiles <- quantile(songs_clean$Likes, c(0.25, 0.75))
Q1 <- likes_quartiles[1]
Q3 <- likes_quartiles[2]
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
likes_in_range <- songs_clean$Likes >= lower_bound &
  songs_clean$Likes <= upper_bound
songs_clean <- songs_clean[likes_in_range, ]
# Keep only rows whose log-transformed columns are finite; is.finite() is
# FALSE for NA, NaN and +/-Inf alike.
logs_are_finite <- is.finite(songs_clean$log_likes) &
  is.finite(songs_clean$log_dance)
songs_clean <- songs_clean[logs_are_finite, ]
Splitting the dataset into testing and training datasets
# Fix the RNG seed so the random 75/25 split, and every metric computed from
# it, is the same on each run. Numbers reported from earlier unseeded runs
# will differ from a seeded re-run.
set.seed(310)
songs_split <- initial_split(songs_clean, prop = 0.75)
songs_train <- training(songs_split)
songs_test <- testing(songs_split)
Version 5
# Fit a linear regression of log_likes on seven predictors (Danceability,
# Licensed, official_video, Valence, channel_factor, Loudness, Liveness),
# using the training split only.
lin_mod5 <- lm(log_likes ~ Danceability + Licensed + official_video + Valence +
channel_factor + Loudness + Liveness, data = songs_train)
# Check the summary of the model
summary(lin_mod5)
##
## Call:
## lm(formula = log_likes ~ Danceability + Licensed + official_video +
## Valence + channel_factor + Loudness + Liveness, data = songs_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.775 -1.074 0.301 1.411 7.214
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 9.568407 0.480019 19.933
## Danceability 0.933487 0.125882 7.416
## LicensedTrue 0.330246 0.065996 5.004
## official_videoTrue 1.490195 0.072101 20.668
## Valence -0.373176 0.082741 -4.510
## channel_factorDisneyMusicVEVO 1.725819 0.611499 2.822
## channel_factorRHINO 0.369808 0.568757 0.650
## channel_factorSMTOWN 2.144931 0.725793 2.955
## channel_factorSony Music India 1.614380 0.627562 2.572
## channel_factorSonyMusicIndiaVEVO 1.956641 0.539356 3.628
## channel_factorSonyMusicSouthVEVO 1.278830 0.538172 2.376
## channel_factorT-Series 1.874958 0.510177 3.675
## channel_factorYRF 2.497900 0.616816 4.050
## channel_factorZee Music Company 1.784397 0.584305 3.054
## channel_factorOther 0.549426 0.470567 1.168
## Loudness 0.112633 0.004175 26.976
## Liveness -0.399363 0.106853 -3.738
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## Danceability 0.000000000000129 ***
## LicensedTrue 0.000000568804402 ***
## official_videoTrue < 0.0000000000000002 ***
## Valence 0.000006535726002 ***
## channel_factorDisneyMusicVEVO 0.004776 **
## channel_factorRHINO 0.515572
## channel_factorSMTOWN 0.003129 **
## channel_factorSony Music India 0.010109 *
## channel_factorSonyMusicIndiaVEVO 0.000287 ***
## channel_factorSonyMusicSouthVEVO 0.017504 *
## channel_factorT-Series 0.000239 ***
## channel_factorYRF 0.000051594129649 ***
## channel_factorZee Music Company 0.002264 **
## channel_factorOther 0.242997
## Loudness < 0.0000000000000002 ***
## Liveness 0.000187 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.994 on 12766 degrees of freedom
## Multiple R-squared: 0.2166, Adjusted R-squared: 0.2156
## F-statistic: 220.6 on 16 and 12766 DF, p-value: < 0.00000000000000022
# Render the coefficients as a table (estimates, confidence intervals and
# p-values, plus observation count and R-squared)
tab_model(lin_mod5)
| &nbsp; | log_likes | ||
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | 9.57 | 8.63 – 10.51 | <0.001 |
| Danceability | 0.93 | 0.69 – 1.18 | <0.001 |
| Licensed [True] | 0.33 | 0.20 – 0.46 | <0.001 |
| official video [True] | 1.49 | 1.35 – 1.63 | <0.001 |
| Valence | -0.37 | -0.54 – -0.21 | <0.001 |
| channel factor [DisneyMusicVEVO] | 1.73 | 0.53 – 2.92 | 0.005 |
| channel factor [RHINO] | 0.37 | -0.75 – 1.48 | 0.516 |
| channel factor [SMTOWN] | 2.14 | 0.72 – 3.57 | 0.003 |
| channel factor [Sony Music India] | 1.61 | 0.38 – 2.84 | 0.010 |
| channel factor [SonyMusicIndiaVEVO] | 1.96 | 0.90 – 3.01 | <0.001 |
| channel factor [SonyMusicSouthVEVO] | 1.28 | 0.22 – 2.33 | 0.018 |
| channel factor [T-Series] | 1.87 | 0.87 – 2.87 | <0.001 |
| channel factor [YRF] | 2.50 | 1.29 – 3.71 | <0.001 |
| channel factor [Zee Music Company] | 1.78 | 0.64 – 2.93 | 0.002 |
| channel factor [Other] | 0.55 | -0.37 – 1.47 | 0.243 |
| Loudness | 0.11 | 0.10 – 0.12 | <0.001 |
| Liveness | -0.40 | -0.61 – -0.19 | <0.001 |
| Observations | 12783 | ||
| R2 / R2 adjusted | 0.217 / 0.216 | ||
# Plot the fitted model's coefficient estimates.
plot_model(lin_mod5)
# Return the coefficient table in tidy (one row per term) form.
tidy(lin_mod5)
Generating the predictions
# Predict log_likes for the training rows, then exponentiate to get back to
# the Likes scale.
# NOTE(review): exp() of a log-scale prediction is a plain back-transform with
# no retransformation correction - confirm this is what is wanted.
log_preds_train <- predict(lin_mod5, newdata = songs_train)
preds_train <- exp(log_preds_train)
# Same two steps for the held-out test rows.
log_preds_test <- predict(lin_mod5, newdata = songs_test)
preds_test <- exp(log_preds_test)
Calculating RMSE in the test and training sets
#' Root mean squared error
#'
#' @param true Numeric vector of observed values.
#' @param predictions Numeric vector of predicted values.
#' @return The square root of the mean squared difference.
get_rmse <- function(true, predictions) {
  squared_errors <- (predictions - true)^2
  sqrt(mean(squared_errors))
}
# RMSE on the original Likes scale: training rows first, then test rows.
get_rmse(songs_train$Likes, preds_train)
## [1] 304032.7
get_rmse(songs_test$Likes, preds_test)
## [1] 309089.6
Generating prediction/true plots
# Collect predicted and observed Likes for both splits in one long table.
results_train <- tibble(
  preds = preds_train,
  true = songs_train$Likes,
  type = "train"
)
results_test <- tibble(
  preds = preds_test,
  true = songs_test$Likes,
  type = "test"
)
results_df <- bind_rows(results_train, results_test)
# Predicted vs. observed Likes, one panel per split; the purple line is the
# default geom_abline() reference (slope 1, intercept 0). The axis limits
# restrict the plot to 0-1.5M observed and 0-500k predicted Likes.
ggplot(data = results_df, mapping = aes(x = true, y = preds)) +
  geom_point(mapping = aes(color = type), alpha = 1 / 10) +
  geom_abline(color = "purple") +
  facet_wrap(~type) +
  xlim(0, 1500000) +
  ylim(0, 500000) +
  theme_clean(base_size = 8) +
  theme(legend.position = "bottom")
Calculating MAE
#' Mean absolute error
#'
#' @param true Numeric vector of observed values.
#' @param predictions Numeric vector of predicted values.
#' @return The mean of the absolute differences.
get_mae <- function(true, predictions) {
  absolute_errors <- abs(predictions - true)
  mean(absolute_errors)
}
# Mean absolute error per split. Each metric is computed from its own split:
# the two were previously crossed (train from results_test and vice versa),
# so the printed values below are shown against their correct labels.
MAE_train <- get_mae(results_train$true, results_train$preds)
MAE_test <- get_mae(results_test$true, results_test$preds)
print(MAE_train)
## [1] 177188
print(MAE_test)
## [1] 180000.2
# Two-row table for the bar chart. Columns are named with `=`: using `<-`
# inside data.frame() creates global variables and auto-generated column
# names, and the plot then only works by finding those globals.
MAE_data <- data.frame(
  MAE_plot = c("MAE train", "MAE test"),
  MAE_plot1 = c(MAE_train, MAE_test)
)
# One bar per split, with the MAE value printed in the middle of each bar.
plot1 <- ggplot(MAE_data, aes(x = MAE_plot, y = MAE_plot1, fill = MAE_plot)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = MAE_plot1), position = position_stack(vjust = 0.5),
            size = 3) +
  labs(title = "Mean Absolute Error", x = "MAE_plot", y = "MAE_plot1") +
  theme_clean()
print(plot1)
Checking correlation
# Pairwise correlations between all numeric columns, printed to two decimals.
is_numeric_col <- vapply(songs_clean, is.numeric, logical(1))
numeric_data <- songs_clean[is_numeric_col]
correlation_matrix <- cor(numeric_data)
print(round(correlation_matrix, digits = 2))
## Danceability Energy Key Loudness Speechiness
## Danceability 1.00 0.24 0.04 0.36 0.23
## Energy 0.24 1.00 0.03 0.75 0.09
## Key 0.04 0.03 1.00 0.03 0.02
## Loudness 0.36 0.75 0.03 1.00 0.06
## Speechiness 0.23 0.09 0.02 0.06 1.00
## Acousticness -0.28 -0.67 -0.04 -0.56 -0.10
## Instrumentalness -0.32 -0.32 0.00 -0.55 -0.11
## Liveness -0.08 0.18 -0.01 0.09 0.07
## Valence 0.47 0.39 0.04 0.32 0.06
## Tempo -0.07 0.16 0.00 0.14 0.04
## Duration_ms -0.09 0.03 0.00 0.01 -0.05
## Views 0.06 0.10 0.01 0.15 -0.05
## Likes 0.11 0.10 0.02 0.18 0.02
## Comments 0.05 0.12 0.01 0.14 0.01
## Stream 0.03 0.03 -0.01 0.09 -0.04
## log_likes 0.14 0.18 0.03 0.29 -0.04
## log_dance 0.96 0.29 0.04 0.43 0.20
## Acousticness Instrumentalness Liveness Valence Tempo
## Danceability -0.28 -0.32 -0.08 0.47 -0.07
## Energy -0.67 -0.32 0.18 0.39 0.16
## Key -0.04 0.00 -0.01 0.04 0.00
## Loudness -0.56 -0.55 0.09 0.32 0.14
## Speechiness -0.10 -0.11 0.07 0.06 0.04
## Acousticness 1.00 0.29 -0.05 -0.21 -0.13
## Instrumentalness 0.29 1.00 -0.06 -0.28 -0.08
## Liveness -0.05 -0.06 1.00 0.03 0.01
## Valence -0.21 -0.28 0.03 1.00 0.09
## Tempo -0.13 -0.08 0.01 0.09 1.00
## Duration_ms -0.03 0.00 0.00 -0.05 -0.02
## Views -0.08 -0.12 0.02 0.07 0.04
## Likes -0.10 -0.12 -0.01 0.03 0.04
## Comments -0.13 -0.09 0.00 0.01 0.03
## Stream -0.08 -0.09 -0.04 -0.01 0.01
## log_likes -0.17 -0.21 -0.01 0.06 0.05
## log_dance -0.31 -0.40 -0.06 0.48 -0.04
## Duration_ms Views Likes Comments Stream log_likes
## Danceability -0.09 0.06 0.11 0.05 0.03 0.14
## Energy 0.03 0.10 0.10 0.12 0.03 0.18
## Key 0.00 0.01 0.02 0.01 -0.01 0.03
## Loudness 0.01 0.15 0.18 0.14 0.09 0.29
## Speechiness -0.05 -0.05 0.02 0.01 -0.04 -0.04
## Acousticness -0.03 -0.08 -0.10 -0.13 -0.08 -0.17
## Instrumentalness 0.00 -0.12 -0.12 -0.09 -0.09 -0.21
## Liveness 0.00 0.02 -0.01 0.00 -0.04 -0.01
## Valence -0.05 0.07 0.03 0.01 -0.01 0.06
## Tempo -0.02 0.04 0.04 0.03 0.01 0.05
## Duration_ms 1.00 0.05 0.03 0.04 -0.01 0.05
## Views 0.05 1.00 0.82 0.60 0.33 0.57
## Likes 0.03 0.82 1.00 0.77 0.42 0.70
## Comments 0.04 0.60 0.77 1.00 0.28 0.55
## Stream -0.01 0.33 0.42 0.28 1.00 0.31
## log_likes 0.05 0.57 0.70 0.55 0.31 1.00
## log_dance -0.09 0.07 0.11 0.05 0.04 0.16
## log_dance
## Danceability 0.96
## Energy 0.29
## Key 0.04
## Loudness 0.43
## Speechiness 0.20
## Acousticness -0.31
## Instrumentalness -0.40
## Liveness -0.06
## Valence 0.48
## Tempo -0.04
## Duration_ms -0.09
## Views 0.07
## Likes 0.11
## Comments 0.05
## Stream 0.04
## log_likes 0.16
## log_dance 1.00
# Report every pair of variables whose absolute correlation exceeds the
# threshold. Entries equal to exactly 1 (the diagonal) are skipped. The matrix
# is symmetric, so each pair is listed twice, once in each order.
threshold <- 0.3
strong_correlation_indices <- which(abs(correlation_matrix) > threshold &
  correlation_matrix != 1, arr.ind = TRUE)
# seq_len() yields an empty sequence when nothing qualifies; 1:nrow() would
# iterate over c(1, 0) and fail on the first lookup.
for (i in seq_len(nrow(strong_correlation_indices))) {
  row_index <- strong_correlation_indices[i, 1]
  col_index <- strong_correlation_indices[i, 2]
  correlation <- correlation_matrix[row_index, col_index]
  print(paste("Variables:", rownames(correlation_matrix)[row_index],
    "and", colnames(correlation_matrix)[col_index], "Correlation:",
    round(correlation, 2)))
}
## [1] "Variables: Loudness and Danceability Correlation: 0.36"
## [1] "Variables: Instrumentalness and Danceability Correlation: -0.32"
## [1] "Variables: Valence and Danceability Correlation: 0.47"
## [1] "Variables: log_dance and Danceability Correlation: 0.96"
## [1] "Variables: Loudness and Energy Correlation: 0.75"
## [1] "Variables: Acousticness and Energy Correlation: -0.67"
## [1] "Variables: Instrumentalness and Energy Correlation: -0.32"
## [1] "Variables: Valence and Energy Correlation: 0.39"
## [1] "Variables: Danceability and Loudness Correlation: 0.36"
## [1] "Variables: Energy and Loudness Correlation: 0.75"
## [1] "Variables: Acousticness and Loudness Correlation: -0.56"
## [1] "Variables: Instrumentalness and Loudness Correlation: -0.55"
## [1] "Variables: Valence and Loudness Correlation: 0.32"
## [1] "Variables: log_dance and Loudness Correlation: 0.43"
## [1] "Variables: Energy and Acousticness Correlation: -0.67"
## [1] "Variables: Loudness and Acousticness Correlation: -0.56"
## [1] "Variables: log_dance and Acousticness Correlation: -0.31"
## [1] "Variables: Danceability and Instrumentalness Correlation: -0.32"
## [1] "Variables: Energy and Instrumentalness Correlation: -0.32"
## [1] "Variables: Loudness and Instrumentalness Correlation: -0.55"
## [1] "Variables: log_dance and Instrumentalness Correlation: -0.4"
## [1] "Variables: Danceability and Valence Correlation: 0.47"
## [1] "Variables: Energy and Valence Correlation: 0.39"
## [1] "Variables: Loudness and Valence Correlation: 0.32"
## [1] "Variables: log_dance and Valence Correlation: 0.48"
## [1] "Variables: Likes and Views Correlation: 0.82"
## [1] "Variables: Comments and Views Correlation: 0.6"
## [1] "Variables: Stream and Views Correlation: 0.33"
## [1] "Variables: log_likes and Views Correlation: 0.57"
## [1] "Variables: Views and Likes Correlation: 0.82"
## [1] "Variables: Comments and Likes Correlation: 0.77"
## [1] "Variables: Stream and Likes Correlation: 0.42"
## [1] "Variables: log_likes and Likes Correlation: 0.7"
## [1] "Variables: Views and Comments Correlation: 0.6"
## [1] "Variables: Likes and Comments Correlation: 0.77"
## [1] "Variables: log_likes and Comments Correlation: 0.55"
## [1] "Variables: Views and Stream Correlation: 0.33"
## [1] "Variables: Likes and Stream Correlation: 0.42"
## [1] "Variables: log_likes and Stream Correlation: 0.31"
## [1] "Variables: Views and log_likes Correlation: 0.57"
## [1] "Variables: Likes and log_likes Correlation: 0.7"
## [1] "Variables: Comments and log_likes Correlation: 0.55"
## [1] "Variables: Stream and log_likes Correlation: 0.31"
## [1] "Variables: Danceability and log_dance Correlation: 0.96"
## [1] "Variables: Loudness and log_dance Correlation: 0.43"
## [1] "Variables: Acousticness and log_dance Correlation: -0.31"
## [1] "Variables: Instrumentalness and log_dance Correlation: -0.4"
## [1] "Variables: Valence and log_dance Correlation: 0.48"
# Names of the numeric columns in the cleaned data
numeric_columns <- colnames(songs_clean)[vapply(songs_clean, is.numeric, logical(1))]
# Correlation matrix over those columns
correlation_matrix <- cor(songs_clean[, numeric_columns])
# Colour-coded heatmap with the coefficients printed in each cell and the
# variables ordered by hierarchical clustering
corr_heatmap <- corrplot(
  correlation_matrix,
  method = "color",
  addCoef.col = "black",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45,
  tl.cex = 0.7,
  number.cex = 0.5
)
Checking histograms
# Histogram of the response (Likes), followed by one for each numeric
# predictor used in lin_mod5.
hist(songs_clean$Likes)
hist(songs_clean$Danceability)
hist(songs_clean$Valence)
hist(songs_clean$Loudness)
hist(songs_clean$Liveness)
Summary stats on our dependent variable Likes
summary(songs_clean$Likes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 16385 88364 217023 300520 1282925
# Bar plot for mean and median
likes_summary <- summary(songs_clean$Likes)
bar_heights <- c(likes_summary["Mean"], likes_summary["Median"])
barplot(
  bar_heights,
  col = c("lightblue", "lightgreen"),
  main = "Mean and Median of Likes",
  ylab = "Count",
  names.arg = c("Mean", "Median"),
  # leave 500,000 of headroom above the taller bar
  ylim = c(0, max(bar_heights) + 500000)
)
Observations for the variable Artist - find the top 15 artists by total YouTube views and by total Spotify streams
# Most viewed artists on YouTube: total Views per artist, top 15 rows.
artists_yt <- songs_clean %>%
  group_by(Artist) %>%
  summarize(TotalViews = sum(Views)) %>%
  arrange(desc(TotalViews)) %>%
  head(15)
# Horizontal bar plot. The axis title is passed to labs() as `x`; `xlabel` is
# not an axis argument there, so the title was never applied.
ggplot(artists_yt, aes(x = TotalViews, y = reorder(Artist, -TotalViews))) +
  geom_bar(stat = "identity", fill = "red") +
  labs(title = "Top 15 most viewed artists on Youtube",
       x = "Number of views", y = "Artists") +
  theme_clean()
# Most streamed artists on Spotify: total Stream per artist, top 15 rows.
# (This reuses the artists_yt name.)
artists_yt <- songs_clean %>%
  group_by(Artist) %>%
  summarize(TotalStreams = sum(Stream)) %>%
  arrange(desc(TotalStreams)) %>%
  head(15)
# Create a horizontal bar plot. The axis title is passed to labs() as `x`;
# `xlabel` is not an axis argument there, so the title was never applied.
ggplot(artists_yt, aes(x = TotalStreams, y = reorder(Artist, -TotalStreams))) +
  geom_bar(stat = "identity", fill = "green") +
  labs(title = "Top 15 most streamed artists on Spotify",
       x = "Number of streams", y = "Artists") +
  theme_clean()
Observations for variable Track
# Number of top tracks to keep. The plot titles are built from this value so
# they cannot drift from the selection (they said "Top 5" while 8 rows were
# selected).
top_n_tracks <- 8
top_tracks_title <- paste("Top", top_n_tracks, "Liked Songs")
# slice_max() already returns the rows with the largest Likes, so no separate
# arrange() step is needed beforehand.
toppsongs_likes <- songs_clean %>%
  slice_max(Likes, n = top_n_tracks) %>%
  select(Artist, Track, Likes, Danceability, Speechiness, Acousticness,
         Instrumentalness, Liveness, Valence, Tempo, Energy)
# Create a bar plot with every bar in the same colour
ggplot(toppsongs_likes, aes(x = Track, y = Likes, fill = Artist)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = top_tracks_title, x = "Track", y = "Number of Likes") +
  scale_fill_manual(values = rep("#A52A2A", nrow(toppsongs_likes))) +
  theme_clean() +
  theme(axis.text.x = element_text(size = 8, angle = 45, hjust = 1))
unique_artists <- unique(toppsongs_likes$Artist)
artist_colors <- rainbow(length(unique_artists))
# Create a named vector with artist-color mapping
color_mapping <- setNames(artist_colors, unique_artists)
# Create the same bar plot with one colour per artist
ggplot(toppsongs_likes, aes(x = Track, y = Likes, fill = Artist)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = top_tracks_title, x = "Track", y = "Number of Likes") +
  scale_fill_manual(values = color_mapping) +
  theme_clean() +
  theme(axis.text.x = element_text(size = 8, angle = 45, hjust = 1))
Relationship between top songs and other variables
# Bar chart of one audio feature across the top-liked tracks.
#   data    - data frame holding a Track column and the feature column
#   feature - name of the feature column, given as a string
# Returns the ggplot object. The fill legend is switched off with
# guides(fill = "none"); passing FALSE there is deprecated in ggplot2.
plot_top_track_feature <- function(data, feature) {
  ggplot(data, aes(x = Track, y = .data[[feature]], fill = Track)) +
    geom_bar(stat = "identity") +
    labs(title = paste(feature, "of YouTube top tracks"), x = "", y = feature) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 20, hjust = 1)) +
    guides(fill = "none")
}
p1 <- plot_top_track_feature(toppsongs_likes, "Danceability")
p2 <- plot_top_track_feature(toppsongs_likes, "Energy")
p3 <- plot_top_track_feature(toppsongs_likes, "Speechiness")
p4 <- plot_top_track_feature(toppsongs_likes, "Valence")
p5 <- plot_top_track_feature(toppsongs_likes, "Acousticness")
p6 <- plot_top_track_feature(toppsongs_likes, "Liveness")
# Draw the six plots in order.
for (feature_plot in list(p1, p2, p3, p4, p5, p6)) {
  print(feature_plot)
}
Observations for the variable valence
# Likes against Valence: semi-transparent points plus a straight-line
# least-squares fit drawn without a confidence band.
ggplot(data = songs_clean, mapping = aes(x = Valence, y = Likes)) +
  geom_point(alpha = 1 / 10) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "Scatter Plot of Likes vs Valence",
    x = "Valence",
    y = "Likes"
  ) +
  ylim(0, 10000000) +
  theme_clean()
Finding the top channels on Youtube
library(ggplot2)
# Filter data to exclude the specific level
# NOTE(review): "channel_to_exclude" is not one of the channel_factor levels
# listed earlier in this file, so this filter keeps every row - confirm which
# level was meant to be dropped.
filtered_data <- subset(songs_clean, channel_factor != "channel_to_exclude")
# Create a bar plot of the number of tracks per channel level.
# theme_minimal() is now joined to the plot with `+`; as a separate statement
# it was never applied and its theme object was printed instead. It comes
# before theme() so the axis-text settings are not overwritten by it.
ggplot(filtered_data, aes(x = channel_factor)) +
  geom_bar(fill = "#A52A2A") +
  labs(title = "Count of Tracks for Each Channel Factor ", x = "Channel Factor",
       y = "Count") +
  ylim(0, 250) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 8, angle = 45, hjust = 1))
## List of 97
## $ line :List of 6
## ..$ colour : chr "black"
## ..$ linewidth : num 0.5
## ..$ linetype : num 1
## ..$ lineend : chr "butt"
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ rect :List of 5
## ..$ fill : chr "white"
## ..$ colour : chr "black"
## ..$ linewidth : num 0.5
## ..$ linetype : num 1
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_rect" "element"
## $ text :List of 11
## ..$ family : chr ""
## ..$ face : chr "plain"
## ..$ colour : chr "black"
## ..$ size : num 11
## ..$ hjust : num 0.5
## ..$ vjust : num 0.5
## ..$ angle : num 0
## ..$ lineheight : num 0.9
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ title : NULL
## $ aspect.ratio : NULL
## $ axis.title : NULL
## $ axis.title.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 2.75points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 2.75points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.x.bottom : NULL
## $ axis.title.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : num 90
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.75points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.title.y.left : NULL
## $ axis.title.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : num -90
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 2.75points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "grey30"
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 2.2points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x.top :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : num 0
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 2.2points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.x.bottom : NULL
## $ axis.text.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 1
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 2.2points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.text.y.left : NULL
## $ axis.text.y.right :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 0points 2.2points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ axis.ticks : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ axis.ticks.x : NULL
## $ axis.ticks.x.top : NULL
## $ axis.ticks.x.bottom : NULL
## $ axis.ticks.y : NULL
## $ axis.ticks.y.left : NULL
## $ axis.ticks.y.right : NULL
## $ axis.ticks.length : 'simpleUnit' num 2.75points
## ..- attr(*, "unit")= int 8
## $ axis.ticks.length.x : NULL
## $ axis.ticks.length.x.top : NULL
## $ axis.ticks.length.x.bottom: NULL
## $ axis.ticks.length.y : NULL
## $ axis.ticks.length.y.left : NULL
## $ axis.ticks.length.y.right : NULL
## $ axis.line : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ axis.line.x : NULL
## $ axis.line.x.top : NULL
## $ axis.line.x.bottom : NULL
## $ axis.line.y : NULL
## $ axis.line.y.left : NULL
## $ axis.line.y.right : NULL
## $ legend.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.margin : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
## ..- attr(*, "unit")= int 8
## $ legend.spacing : 'simpleUnit' num 11points
## ..- attr(*, "unit")= int 8
## $ legend.spacing.x : NULL
## $ legend.spacing.y : NULL
## $ legend.key : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.key.size : 'simpleUnit' num 1.2lines
## ..- attr(*, "unit")= int 3
## $ legend.key.height : NULL
## $ legend.key.width : NULL
## $ legend.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.text.align : NULL
## $ legend.title :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ legend.title.align : NULL
## $ legend.position : chr "right"
## $ legend.direction : NULL
## $ legend.justification : chr "center"
## $ legend.box : NULL
## $ legend.box.just : NULL
## $ legend.box.margin : 'margin' num [1:4] 0cm 0cm 0cm 0cm
## ..- attr(*, "unit")= int 1
## $ legend.box.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ legend.box.spacing : 'simpleUnit' num 11points
## ..- attr(*, "unit")= int 8
## $ panel.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ panel.border : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ panel.spacing : 'simpleUnit' num 5.5points
## ..- attr(*, "unit")= int 8
## $ panel.spacing.x : NULL
## $ panel.spacing.y : NULL
## $ panel.grid :List of 6
## ..$ colour : chr "grey92"
## ..$ linewidth : NULL
## ..$ linetype : NULL
## ..$ lineend : NULL
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ panel.grid.major : NULL
## $ panel.grid.minor :List of 6
## ..$ colour : NULL
## ..$ linewidth : 'rel' num 0.5
## ..$ linetype : NULL
## ..$ lineend : NULL
## ..$ arrow : logi FALSE
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_line" "element"
## $ panel.grid.major.x : NULL
## $ panel.grid.major.y : NULL
## $ panel.grid.minor.x : NULL
## $ panel.grid.minor.y : NULL
## $ panel.ontop : logi FALSE
## $ plot.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ plot.title :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 1.2
## ..$ hjust : num 0
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 5.5points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.title.position : chr "panel"
## $ plot.subtitle :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : num 0
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 0points 0points 5.5points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.caption :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 0.8
## ..$ hjust : num 1
## ..$ vjust : num 1
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 5.5points 0points 0points 0points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.caption.position : chr "panel"
## $ plot.tag :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : 'rel' num 1.2
## ..$ hjust : num 0.5
## ..$ vjust : num 0.5
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ plot.tag.position : chr "topleft"
## $ plot.margin : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
## ..- attr(*, "unit")= int 8
## $ strip.background : list()
## ..- attr(*, "class")= chr [1:2] "element_blank" "element"
## $ strip.background.x : NULL
## $ strip.background.y : NULL
## $ strip.clip : chr "inherit"
## $ strip.placement : chr "inside"
## $ strip.text :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : chr "grey10"
## ..$ size : 'rel' num 0.8
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : 'margin' num [1:4] 4.4points 4.4points 4.4points 4.4points
## .. ..- attr(*, "unit")= int 8
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ strip.text.x : NULL
## $ strip.text.x.bottom : NULL
## $ strip.text.x.top : NULL
## $ strip.text.y :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : num -90
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ strip.text.y.left :List of 11
## ..$ family : NULL
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : num 90
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi TRUE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## $ strip.text.y.right : NULL
## $ strip.switch.pad.grid : 'simpleUnit' num 2.75points
## ..- attr(*, "unit")= int 8
## $ strip.switch.pad.wrap : 'simpleUnit' num 2.75points
## ..- attr(*, "unit")= int 8
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi TRUE
## - attr(*, "validate")= logi TRUE
Version 1 - low Adjusted R-squared: 0.09493 - Variables that stood out according to their p-values: Danceability, LicensedTrue, Instrumentalness, channel_factorSMTOWN, channel_factorSonyMusicSouthVEVO - the model needs to be modified
# Version 1: regress Likes on Danceability, Duration_ms, channel_factor,
# Licensed, Album_type and Instrumentalness, fitted on songs_train.
lin_mod1 <- lm(
  Likes ~ Danceability + Duration_ms + channel_factor + Licensed +
    Album_type + Instrumentalness,
  data = songs_train
)
summary(lin_mod1)
##
## Call:
## lm(formula = Likes ~ Danceability + Duration_ms + channel_factor +
## Licensed + Album_type + Instrumentalness, data = songs_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -673522 -170463 -95007 71641 1181076
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -67126.28162 65381.45034 -1.027
## Danceability 140543.28742 15676.54422 8.965
## Duration_ms 0.04799 0.01708 2.809
## channel_factorDisneyMusicVEVO 290960.37318 83975.55647 3.465
## channel_factorRHINO -5009.33848 77770.54604 -0.064
## channel_factorSMTOWN 637360.63880 99318.31030 6.417
## channel_factorSony Music India 336793.67969 85896.01870 3.921
## channel_factorSonyMusicIndiaVEVO 303118.88719 73782.78287 4.108
## channel_factorSonyMusicSouthVEVO 95329.11116 73628.94574 1.295
## channel_factorT-Series 393964.17391 69850.14140 5.640
## channel_factorYRF 510288.46290 84354.31456 6.049
## channel_factorZee Music Company 406853.27244 80016.55207 5.085
## channel_factorOther 100400.92147 64382.09563 1.559
## LicensedTrue 132378.98162 5205.13043 25.432
## Album_typecompilation -21412.71857 12897.58009 -1.660
## Album_typesingle -5722.84529 5840.48220 -0.980
## Instrumentalness -117439.80683 12845.04276 -9.143
## Pr(>|t|)
## (Intercept) 0.304587
## Danceability < 0.0000000000000002 ***
## Duration_ms 0.004973 **
## channel_factorDisneyMusicVEVO 0.000532 ***
## channel_factorRHINO 0.948643
## channel_factorSMTOWN 0.000000000144 ***
## channel_factorSony Music India 0.000088662986 ***
## channel_factorSonyMusicIndiaVEVO 0.000040113819 ***
## channel_factorSonyMusicSouthVEVO 0.195439
## channel_factorT-Series 0.000000017353 ***
## channel_factorYRF 0.000000001495 ***
## channel_factorZee Music Company 0.000000373594 ***
## channel_factorOther 0.118914
## LicensedTrue < 0.0000000000000002 ***
## Album_typecompilation 0.096896 .
## Album_typesingle 0.327175
## Instrumentalness < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 272900 on 12766 degrees of freedom
## Multiple R-squared: 0.09606, Adjusted R-squared: 0.09493
## F-statistic: 84.79 on 16 and 12766 DF, p-value: < 0.00000000000000022
# Coefficient plot for lin_mod1
plot_model(lin_mod1)
# Generating predictions for our model
# (the former bare predict(lin_mod1) call was removed: its result was
# overwritten by the next line before being used)
preds_train <- predict(lin_mod1, newdata = songs_train)
preds_test <- predict(lin_mod1, newdata = songs_test)
# Calculating RMSE in the test and training sets
#
# get_rmse(): root mean squared error between observed and predicted values.
#   true        - numeric vector of observed values
#   predictions - numeric vector of predicted values; same length as `true`,
#                 or a single value compared against every observation
# Returns a single number, sqrt(mean((true - predictions)^2)).
# Stops with an error when the two lengths differ and neither is 1.
get_rmse <- function(true, predictions) {
  n_true <- length(true)
  n_pred <- length(predictions)
  # R recycles the shorter vector silently when the lengths divide evenly,
  # which would produce a plausible-looking but meaningless error value.
  if (n_true != n_pred && n_true != 1L && n_pred != 1L) {
    stop(
      "`true` and `predictions` must have the same length (got ",
      n_true, " and ", n_pred, ").",
      call. = FALSE
    )
  }
  sqrt(mean((true - predictions)^2))
}
# RMSE of lin_mod1 on the training data
get_rmse(true = songs_train$Likes, predictions = preds_train)
## [1] 272678.9
# RMSE of lin_mod1 on the test data
get_rmse(true = songs_test$Likes, predictions = preds_test)
## [1] 278304
Version 2 - removing channel_factor as a variable did not bring much change; instead, it lowered the Adjusted R-squared value
# Version 2: the lin_mod1 specification without channel_factor,
# fitted on songs_train.
lin_mod2 <- lm(
  Likes ~ Danceability + Duration_ms + Licensed + Album_type +
    Instrumentalness,
  data = songs_train
)
summary(lin_mod2)
##
## Call:
## lm(formula = Likes ~ Danceability + Duration_ms + Licensed +
## Album_type + Instrumentalness, data = songs_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -480494 -176616 -97802 75078 1183211
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 31589.83073 11849.11819 2.666
## Danceability 137233.51423 15857.78031 8.654
## Duration_ms 0.05585 0.01725 3.237
## LicensedTrue 140892.38337 5228.52666 26.947
## Album_typecompilation -5677.87940 12923.90664 -0.439
## Album_typesingle 1943.83779 5884.33464 0.330
## Instrumentalness -125658.16863 12988.41266 -9.675
## Pr(>|t|)
## (Intercept) 0.00769 **
## Danceability < 0.0000000000000002 ***
## Duration_ms 0.00121 **
## LicensedTrue < 0.0000000000000002 ***
## Album_typecompilation 0.66043
## Album_typesingle 0.74115
## Instrumentalness < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276200 on 12776 degrees of freedom
## Multiple R-squared: 0.07319, Adjusted R-squared: 0.07275
## F-statistic: 168.1 on 6 and 12776 DF, p-value: < 0.00000000000000022
# Coefficient plot for lin_mod2
plot_model(lin_mod2)
# Generating predictions for our model
# (the former bare predict(lin_mod2) call was removed: its result was
# overwritten by the next line before being used)
preds_train <- predict(lin_mod2, newdata = songs_train)
preds_test <- predict(lin_mod2, newdata = songs_test)
# Calculating RMSE in the test and training sets
#
# get_rmse(): root mean squared error between observed and predicted values.
#   true        - numeric vector of observed values
#   predictions - numeric vector of predicted values; same length as `true`,
#                 or a single value compared against every observation
# Returns a single number, sqrt(mean((true - predictions)^2)).
# Stops with an error when the two lengths differ and neither is 1.
# NOTE(review): get_rmse is defined several times in this report with the
# same body; a single definition would suffice.
get_rmse <- function(true, predictions) {
  n_true <- length(true)
  n_pred <- length(predictions)
  # R recycles the shorter vector silently when the lengths divide evenly,
  # which would produce a plausible-looking but meaningless error value.
  if (n_true != n_pred && n_true != 1L && n_pred != 1L) {
    stop(
      "`true` and `predictions` must have the same length (got ",
      n_true, " and ", n_pred, ").",
      call. = FALSE
    )
  }
  sqrt(mean((true - predictions)^2))
}
# RMSE of lin_mod2 on the training data
get_rmse(true = songs_train$Likes, predictions = preds_train)
## [1] 276107.5
# RMSE of lin_mod2 on the test data
get_rmse(true = songs_test$Likes, predictions = preds_test)
## [1] 280038.1
Version 3 Semifinal Model
# Version 3: regress Likes on Danceability, Licensed, official_video,
# Valence, channel_factor, Loudness and Liveness, fitted on songs_train.
lin_mod3 <- lm(
  Likes ~ Danceability + Licensed + official_video + Valence +
    channel_factor + Loudness + Liveness,
  data = songs_train
)
summary(lin_mod3)
##
## Call:
## lm(formula = Likes ~ Danceability + Licensed + official_video +
## Valence + channel_factor + Loudness + Liveness, data = songs_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -671382 -180058 -85165 73167 1227352
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -2374.1 65061.7 -0.036
## Danceability 137307.0 17062.0 8.048
## LicensedTrue 50787.1 8945.0 5.678
## official_videoTrue 103223.3 9772.6 10.563
## Valence -61546.8 11214.7 -5.488
## channel_factorDisneyMusicVEVO 296863.5 82882.3 3.582
## channel_factorRHINO 24190.8 77089.1 0.314
## channel_factorSMTOWN 629730.5 98373.7 6.401
## channel_factorSony Music India 350448.0 85059.6 4.120
## channel_factorSonyMusicIndiaVEVO 331594.0 73104.1 4.536
## channel_factorSonyMusicSouthVEVO 123329.7 72943.7 1.691
## channel_factorT-Series 403857.3 69149.2 5.840
## channel_factorYRF 545792.2 83603.1 6.528
## channel_factorZee Music Company 415664.3 79196.5 5.249
## channel_factorOther 115254.5 63780.5 1.807
## Loudness 7821.7 565.9 13.821
## Liveness -26734.7 14482.8 -1.846
## Pr(>|t|)
## (Intercept) 0.970892
## Danceability 0.000000000000000919 ***
## LicensedTrue 0.000000013950030670 ***
## official_videoTrue < 0.0000000000000002 ***
## Valence 0.000000041418305336 ***
## channel_factorDisneyMusicVEVO 0.000343 ***
## channel_factorRHINO 0.753676
## channel_factorSMTOWN 0.000000000159335363 ***
## channel_factorSony Music India 0.000038121558777772 ***
## channel_factorSonyMusicIndiaVEVO 0.000005787725596645 ***
## channel_factorSonyMusicSouthVEVO 0.090908 .
## channel_factorT-Series 0.000000005335163418 ***
## channel_factorYRF 0.000000000069004181 ***
## channel_factorZee Music Company 0.000000155784643841 ***
## channel_factorOther 0.070778 .
## Loudness < 0.0000000000000002 ***
## Liveness 0.064921 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 270200 on 12766 degrees of freedom
## Multiple R-squared: 0.1134, Adjusted R-squared: 0.1123
## F-statistic: 102.1 on 16 and 12766 DF, p-value: < 0.00000000000000022
# Tabulating the lin_mod3 coefficients: estimates, confidence intervals and
# p-values (the rendered table follows)
tab_model(lin_mod3)
|   | Likes | | |
|---|---|---|---|
| Predictors | Estimates | CI | p |
| (Intercept) | -2374.12 | -129904.75 – 125156.50 | 0.971 |
| Danceability | 137306.95 | 103862.84 – 170751.07 | <0.001 |
| Licensed [True] | 50787.06 | 33253.48 – 68320.64 | <0.001 |
| official video [True] | 103223.26 | 84067.55 – 122378.97 | <0.001 |
| Valence | -61546.82 | -83529.37 – -39564.27 | <0.001 |
| channel factor [DisneyMusicVEVO] | 296863.50 | 134401.73 – 459325.27 | <0.001 |
| channel factor [RHINO] | 24190.80 | -126915.37 – 175296.96 | 0.754 |
| channel factor [SMTOWN] | 629730.51 | 436903.32 – 822557.71 | <0.001 |
| channel factor [Sony Music India] | 350447.95 | 183718.40 – 517177.50 | <0.001 |
| channel factor [SonyMusicIndiaVEVO] | 331594.02 | 188298.98 – 474889.06 | <0.001 |
| channel factor [SonyMusicSouthVEVO] | 123329.74 | -19650.81 – 266310.30 | 0.091 |
| channel factor [T-Series] | 403857.29 | 268314.51 – 539400.07 | <0.001 |
| channel factor [YRF] | 545792.18 | 381917.58 – 709666.79 | <0.001 |
| channel factor [Zee Music Company] | 415664.34 | 260427.33 – 570901.36 | <0.001 |
| channel factor [Other] | 115254.47 | -9764.90 – 240273.84 | 0.071 |
| Loudness | 7821.72 | 6712.44 – 8931.00 | <0.001 |
| Liveness | -26734.69 | -55123.17 – 1653.79 | 0.065 |
| Observations | 12783 | | |
| R2 / R2 adjusted | 0.113 / 0.112 | | |
# Coefficient plot for lin_mod3
plot_model(lin_mod3)
# Coefficient table for lin_mod3 in tidy (data frame) form
tidy(lin_mod3)
Version 4
# Version 4: the lin_mod3 specification without channel_factor,
# fitted on songs_train.
lin_mod4 <- lm(
  Likes ~ Danceability + Licensed + official_video + Valence +
    Loudness + Liveness,
  data = songs_train
)
summary(lin_mod4)
##
## Call:
## lm(formula = Likes ~ Danceability + Licensed + official_video +
## Valence + Loudness + Liveness, data = songs_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -339295 -185833 -86709 73430 1230271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 116774.8 14138.8 8.259 < 0.0000000000000002
## Danceability 134917.0 17263.7 7.815 0.00000000000000592
## LicensedTrue 59358.1 9034.5 6.570 0.00000000005221201
## official_videoTrue 102896.3 9894.3 10.400 < 0.0000000000000002
## Valence -60256.8 11337.6 -5.315 0.00000010857173525
## Loudness 8055.1 572.3 14.075 < 0.0000000000000002
## Liveness -32176.1 14649.0 -2.196 0.0281
##
## (Intercept) ***
## Danceability ***
## LicensedTrue ***
## official_videoTrue ***
## Valence ***
## Loudness ***
## Liveness *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273600 on 12776 degrees of freedom
## Multiple R-squared: 0.09042, Adjusted R-squared: 0.08999
## F-statistic: 211.7 on 6 and 12776 DF, p-value: < 0.00000000000000022
# Coefficient plot for lin_mod4
plot_model(lin_mod4)
# Generating predictions for our model
# (the former bare predict(lin_mod4) call was removed: its result was
# overwritten by the next line before being used)
preds_train <- predict(lin_mod4, newdata = songs_train)
preds_test <- predict(lin_mod4, newdata = songs_test)
# Calculating RMSE in the test and training sets
#
# get_rmse(): root mean squared error between observed and predicted values.
#   true        - numeric vector of observed values
#   predictions - numeric vector of predicted values; same length as `true`,
#                 or a single value compared against every observation
# Returns a single number, sqrt(mean((true - predictions)^2)).
# Stops with an error when the two lengths differ and neither is 1.
# NOTE(review): get_rmse is defined several times in this report with the
# same body; a single definition would suffice.
get_rmse <- function(true, predictions) {
  n_true <- length(true)
  n_pred <- length(predictions)
  # R recycles the shorter vector silently when the lengths divide evenly,
  # which would produce a plausible-looking but meaningless error value.
  if (n_true != n_pred && n_true != 1L && n_pred != 1L) {
    stop(
      "`true` and `predictions` must have the same length (got ",
      n_true, " and ", n_pred, ").",
      call. = FALSE
    )
  }
  sqrt(mean((true - predictions)^2))
}
# RMSE of lin_mod4 on the training data
get_rmse(true = songs_train$Likes, predictions = preds_train)
## [1] 273528.1
# RMSE of lin_mod4 on the test data
get_rmse(true = songs_test$Likes, predictions = preds_test)
## [1] 276688
# Generating predictions for our model
# preds_train / preds_test were last assigned from lin_mod4; they are
# re-computed from lin_mod3 here so that the RMSE, plots and absolute-error
# metrics that follow describe lin_mod3.
# (the former bare predict(lin_mod3) call was removed: its result was
# overwritten by the next line before being used)
preds_train <- predict(lin_mod3, newdata = songs_train)
preds_test <- predict(lin_mod3, newdata = songs_test)
# Calculating RMSE in the test and training sets
#
# get_rmse(): root mean squared error between observed and predicted values.
#   true        - numeric vector of observed values
#   predictions - numeric vector of predicted values; same length as `true`,
#                 or a single value compared against every observation
# Returns a single number, sqrt(mean((true - predictions)^2)).
# Stops with an error when the two lengths differ and neither is 1.
# NOTE(review): get_rmse is defined several times in this report with the
# same body; a single definition would suffice.
get_rmse <- function(true, predictions) {
  n_true <- length(true)
  n_pred <- length(predictions)
  # R recycles the shorter vector silently when the lengths divide evenly,
  # which would produce a plausible-looking but meaningless error value.
  if (n_true != n_pred && n_true != 1L && n_pred != 1L) {
    stop(
      "`true` and `predictions` must have the same length (got ",
      n_true, " and ", n_pred, ").",
      call. = FALSE
    )
  }
  sqrt(mean((true - predictions)^2))
}
# RMSE of lin_mod3 on the training data
get_rmse(true = songs_train$Likes, predictions = preds_train)
## [1] 270048.8
# RMSE of lin_mod3 on the test data
get_rmse(true = songs_test$Likes, predictions = preds_test)
## [1] 274708.6
# Generating prediction/true plots
# Pair each prediction with the observed Likes and tag the rows by split.
results_train <- tibble(
  preds = preds_train,
  true = songs_train$Likes,
  type = "train"
)
results_test <- tibble(
  preds = preds_test,
  true = songs_test$Likes,
  type = "test"
)
results_df <- bind_rows(results_train, results_test)

# Predicted vs. observed Likes, one panel per split. geom_abline() is used
# with its defaults (slope 1, intercept 0) as the perfect-prediction line;
# both axes are limited to 0 - 2,500,000.
ggplot(results_df, aes(x = true, y = preds)) +
  geom_point(aes(color = type), alpha = 1 / 10) +
  geom_abline(color = "purple") +
  facet_wrap(~type) +
  xlim(0, 2500000) +
  ylim(0, 2500000) +
  theme_clean(base_size = 8) +
  theme(legend.position = "bottom")
# Calculate the median absolute error (MedAE) in the test and training sets
#
# get_medae(): median of the absolute differences between observed and
# predicted values.
#   true        - numeric vector of observed values
#   predictions - numeric vector of predicted values; same length as `true`,
#                 or a single value compared against every observation
# Returns a single number, median(abs(true - predictions)).
# Stops with an error when the two lengths differ and neither is 1.
get_medae <- function(true, predictions) {
  n_true <- length(true)
  n_pred <- length(predictions)
  # R recycles the shorter vector silently when the lengths divide evenly,
  # which would produce a plausible-looking but meaningless error value.
  if (n_true != n_pred && n_true != 1L && n_pred != 1L) {
    stop(
      "`true` and `predictions` must have the same length (got ",
      n_true, " and ", n_pred, ").",
      call. = FALSE
    )
  }
  median(abs(true - predictions))
}
# Median absolute error on the test rows
with(results_test, get_medae(true, preds))
## [1] 154525.3
# Median absolute error on the training rows
with(results_train, get_medae(true, preds))
## [1] 154287.3
# get_mae(): mean absolute error between observed and predicted values.
#   true        - numeric vector of observed values
#   predictions - numeric vector of predicted values; same length as `true`,
#                 or a single value compared against every observation
# Returns a single number, mean(abs(true - predictions)).
# Stops with an error when the two lengths differ and neither is 1.
get_mae <- function(true, predictions) {
  n_true <- length(true)
  n_pred <- length(predictions)
  # R recycles the shorter vector silently when the lengths divide evenly,
  # which would produce a plausible-looking but meaningless error value.
  if (n_true != n_pred && n_true != 1L && n_pred != 1L) {
    stop(
      "`true` and `predictions` must have the same length (got ",
      n_true, " and ", n_pred, ").",
      call. = FALSE
    )
  }
  mean(abs(true - predictions))
}
# Mean absolute error on the test rows
with(results_test, get_mae(true, preds))
## [1] 199940.9
# Mean absolute error on the training rows
with(results_train, get_mae(true, preds))
## [1] 197787.5
Lasso regression, used to decide which variables to include in the linear models
# Drop the identifier/text columns and the Stream, Comments and Views counts
# before fitting the lasso.
# NOTE(review): log_likes stays among the predictors and receives a non-zero
# coefficient in the output below. If it is derived from Likes (as the name
# suggests) it leaks the response into the model; the same question applies
# to popular. Confirm how both columns are built.
songs_lasso <- songs_clean %>%
  select(-c(Artist, Channel, Track, Album, Title, Stream, Comments, Views))

# Cross-validated lasso (alpha = 1) of Likes on every remaining column.
# NOTE(review): this is fitted on songs_clean, not songs_train, and no seed
# is set in this chunk, so the selected lambda may differ between runs -
# confirm whether a seed is set earlier.
lasso_fit1 <- cv.glmnet(Likes ~ ., data = songs_lasso, alpha = 1)

# Coefficients at the lambda with the lowest cross-validated error
print(coef(lasso_fit1, s = "lambda.min"))
## 34 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -732312.4826031260891
## Album_typealbum -14961.2220563282881
## Album_typecompilation .
## Album_typesingle 1327.6253007368950
## Danceability 235220.9792014449195
## Energy 9526.1836990412248
## Key 358.6198671223917
## Loudness -1386.4072336047909
## Speechiness 138551.5907292820921
## Acousticness 18180.6517090925699
## Instrumentalness 30992.5007054814087
## Liveness 17926.2984514284108
## Valence -14693.7927158808579
## Tempo 177.2259294922730
## Duration_ms 0.0012397169376
## LicensedFalse -22794.3923026303673
## LicensedTrue 0.0000004354782
## official_videoFalse 22923.0175054061292
## official_videoTrue -0.0000004836577
## popular0 -155050.8781290832849
## popular1 .
## channel_factorAtlantic Records -121694.3576162397803
## channel_factorDisneyMusicVEVO .
## channel_factorRHINO -123071.1359877009527
## channel_factorSMTOWN 355735.8565067854943
## channel_factorSony Music India 113654.8131740860845
## channel_factorSonyMusicIndiaVEVO 101538.4225713686028
## channel_factorSonyMusicSouthVEVO -56537.8223991831910
## channel_factorT-Series 145847.2418499816558
## channel_factorYRF 241761.2599059541535
## channel_factorZee Music Company 147761.9635412363859
## channel_factorOther -58058.1173608884637
## log_likes 80571.1806766767550
## log_dance -96064.3879771835927